library(readxl)
library(readr)
library(dplyr)
library(here)
library(tidyverse)
sheet1 <- read_xlsx(here("raw_data/boing-boing-candy-2015.xlsx"))
sheet1
dim(sheet1)
colnames(sheet1)
glimpse(sheet1)
sheet1_long <- sheet1 %>%
pivot_longer(cols = starts_with("["), #converting rows to columns
names_to = "candy",
values_to = "rating")
#check dimensions, column names
dim(sheet1_long)
colnames(sheet1_long)
#Omitting columns that are not required
sheet1_long <- sheet1_long %>%
select(-c(4:29))
colnames(sheet1_long)
[1] "Timestamp" "How old are you?" "Are you going actually going trick or treating yourself?"
[4] "candy" "rating"
dim(sheet1_long)
[1] 534850 5
colnames(sheet1_long) <- c("timestamp", "age", "going_out", "candy", "rating")
colnames(sheet1_long)
[1] "timestamp" "age" "going_out" "candy" "rating"
#Add column ‘year’
sheet1_long <- sheet1_long %>%
add_column(year = 2015)
head(sheet1_long)
sheet2 <- read_xlsx(here("raw_data/boing-boing-candy-2016.xlsx"))
sheet2
dim(sheet2)
colnames(sheet2)
glimpse(sheet2)
sheet2_long <- sheet2 %>%
pivot_longer(cols = starts_with("["), #converting rows to columns
names_to = "candy",
values_to = "rating")
dim(sheet2_long)
colnames(sheet2_long)
#Omitting columns that are not required
sheet2_long <- sheet2_long %>%
select(-c(6:22))
colnames(sheet2_long)
[1] "Timestamp" "Are you going actually going trick or treating yourself?" "Your gender:"
[4] "How old are you?" "Which country do you live in?" "candy"
[7] "rating"
dim(sheet2_long)
[1] 127159 7
colnames(sheet2_long) <- c("timestamp", "going_out", "gender", "age", "country", "candy", "rating")
colnames(sheet2_long)
[1] "timestamp" "going_out" "gender" "age" "country" "candy" "rating"
#Add column ‘year’
sheet2_long <- sheet2_long %>%
add_column(year = 2016)
head(sheet2_long)
sheet3 <- read_xlsx(here("raw_data/boing-boing-candy-2017.xlsx"))
sheet3
dim(sheet3)
colnames(sheet3)
glimpse(sheet3)
sheet3_long <- sheet3 %>%
pivot_longer(cols = starts_with("Q6"), #converting rows to columns
names_to = "candy",
values_to = "rating")
colnames(sheet3_long)
[1] "Internal ID" "Q1: GOING OUT?" "Q2: GENDER" "Q3: AGE" "Q4: COUNTRY"
[6] "Q5: STATE, PROVINCE, COUNTY, ETC" "Q7: JOY OTHER" "Q8: DESPAIR OTHER" "Q9: OTHER COMMENTS" "Q10: DRESS"
[11] "...114" "Q11: DAY" "Q12: MEDIA [Daily Dish]" "Q12: MEDIA [Science]" "Q12: MEDIA [ESPN]"
[16] "Q12: MEDIA [Yahoo]" "Click Coordinates (x, y)" "candy" "rating"
#Omitting columns that are not required
sheet3_long <- sheet3_long %>%
select(-c(6:17))
colnames(sheet3_long)
[1] "Internal ID" "Q1: GOING OUT?" "Q2: GENDER" "Q3: AGE" "Q4: COUNTRY" "candy" "rating"
dim(sheet3_long)
[1] 253380 7
#Change col names
colnames(sheet3_long) <- c("internal_id", "going_out", "gender", "age", "country", "candy", "rating")
colnames(sheet3_long)
[1] "internal_id" "going_out" "gender" "age" "country" "candy" "rating"
sheet3_long
#summarise((count = sum(is.na(rating))))
#Add column ‘year’
sheet3_long <- sheet3_long %>%
add_column(year = 2017)
head(sheet3_long)
#X X X X X X X X X X X X X X X X X X X X X X X X X X X X X X
candy_combined <- bind_rows(sheet1_long, sheet2_long, sheet3_long)
colnames(candy_combined)
[1] "timestamp" "age" "going_out" "candy" "rating" "year" "gender" "country" "internal_id"
dim(candy_combined)
[1] 915389 9
head(candy_combined)
write.xlsx(here(candy_combined, file = 'raw_data/candy_combined.xlsx', colNames = TRUE))
Error in saveWorkbook(wb = wb, file = file, overwrite = overwrite) :
argument "file" is missing, with no default
#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX xXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXxx
#Read in combined data file
candy_clean <- read_csv(here("raw_data/candy_combined.csv"))
Parsed with column specification:
cols(
timestamp = [34mcol_datetime(format = "")[39m,
age = [32mcol_double()[39m,
going_out = [31mcol_character()[39m,
candy = [31mcol_character()[39m,
rating = [31mcol_character()[39m,
year = [32mcol_double()[39m,
gender = [33mcol_logical()[39m,
country = [33mcol_logical()[39m,
internal_id = [33mcol_logical()[39m
)
1014402 parsing failures.
row col expected actual file
5131 age no trailing characters 's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
5132 age no trailing characters 's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
5133 age no trailing characters 's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
5134 age no trailing characters 's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
5135 age no trailing characters 's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
.... ... ...................... ...... ................................................................................
See problems(...) for more details.
head(candy_clean)
#Changing column data type
candy_clean$age <- as.integer(candy_clean$age)
Warning messages:
1: Unknown or uninitialised column: `head`.
2: Unknown or uninitialised column: `head`.
candy_clean$year <- as.integer(candy_clean$year)
candy_clean$gender <- as.character(candy_clean$gender)
candy_clean$country <- as.character(candy_clean$country)
candy_clean$internal_id <- as.integer(candy_clean$internal_id)
head(candy_clean)
NA